In [1]:
from IPython.display import HTML
In [2]:
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')
Out[2]:
The raw code for this IPython notebook is by default hidden for easier reading. To toggle on/off the raw code, click here.

Preventing accidents (in France)

Overall idea and required video content: how to prevent future accidents in France?

  • Motivation: save lives, increase traffic safety, decrease monetary expenses in society
  • Data needed: information about previous accidents in France, when where what happened, who, weather, temporal, spatial etc.
  • include visualizations
  • genre: combined genres. Annotated charts included in a 'slideshow'-thingy, e.g. different tabs on the website exploring/telling different stories in the data. Narrative is fixed as in a slideshow, but the desire is to include interactive charts allowing user exploration, while including our own take-aways (annotated chart style). - Why this combination of genres: mix between user engagement and a predetermined narrative ensures the user goes through the story, but has the option to engage themselves. If a user does not want to explore on his own, he has been enriched and hopefully learned something new. Annotated charts are included since they have the flexibility to convey our narrative while giving the user the option to investigte themselves.
  • metadata: 5 merged datasets after data preperation reduced to 582505 observations, 26 features. 120+ MB. No one-hot encoding. Features: weather, sex, age, vehicle, location, severity, road type etc.
  • include intro statistics: geo spacial data + plot, time-series plot

Ideer til descriptive

  • geografi
  • fordeling af hvor slemme ulykkerne er
  • time dependency (seasonality?). Timer, day of week, month, year etc.
  • choropleth pÃ¥ communes/departements...
  • aldersgrupper af victims
  • stack chart med dagslys og severity af ulykken
In [3]:
# Start writing code here...
In [4]:
# Define imports
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
import pandas as pd
import folium
import plotly.express as px


# fix random generator seed (for reproducibility of results)
np.random.seed(42)

%matplotlib inline
sns.set_style('darkgrid')

Load data

Load data for 2018

In [5]:
char_2018 = pd.read_csv('work/Dataset/caracteristiques-2018.csv',usecols = ['Num_Acc','an','mois','jour','hrmn','lum','atm','com','lat','long','col'])
places_2018 = pd.read_csv('work/Dataset/lieux-2018.csv',usecols=['Num_Acc','catr','nbv','prof','plan','surf','infra','situ'])
users_2018 = pd.read_csv('work/Dataset/usagers-2018.csv',usecols=['Num_Acc','catu','grav','sexe','trajet','an_nais','secu'])
vehicles_2018 = pd.read_csv('work/Dataset/vehicules-2018.csv',usecols=['Num_Acc','catv'])

Load data from 2017

In [6]:
char_2017 = pd.read_csv('work/Dataset/caracteristiques-2017.csv',usecols = ['Num_Acc','an','mois','jour','hrmn','lum','atm','com','lat','long','col'])
places_2017 = pd.read_csv('work/Dataset/lieux-2017.csv',usecols=['Num_Acc','catr','nbv','prof','plan','surf','infra','situ'])
users_2017 = pd.read_csv('work/Dataset/usagers-2017.csv',usecols=['Num_Acc','catu','grav','sexe','trajet','an_nais','secu'])
vehicles_2017 = pd.read_csv('work/Dataset/vehicules-2017.csv',usecols=['Num_Acc','catv'])

Load data from 2016

In [7]:
char_2016 = pd.read_csv('work/Dataset/caracteristiques_2016.csv',usecols = ['Num_Acc','an','mois','jour','hrmn','lum','atm','com','lat','long','col'])
places_2016 = pd.read_csv('work/Dataset/lieux_2016.csv',usecols=['Num_Acc','catr','nbv','prof','plan','surf','infra','situ'])
users_2016 = pd.read_csv('work/Dataset/usagers_2016.csv',usecols=['Num_Acc','catu','grav','sexe','trajet','an_nais','secu'])
vehicles_2016 = pd.read_csv('work/Dataset/vehicules_2016.csv',usecols=['Num_Acc','catv'])

Merge data

In [8]:
# 2018 data
data_2018 = pd.merge(users_2018, char_2018, on='Num_Acc')
data_2018 = pd.merge(data_2018, places_2018, on='Num_Acc')
data_2018 = pd.merge(data_2018, vehicles_2018, on='Num_Acc')
data_2018 = data_2018.dropna()
print("The merged dataset now contains: ", len(data_2018), "observations.")

# 2017 data
data_2017 = pd.merge(users_2017, char_2017, on='Num_Acc')
data_2017 = pd.merge(data_2017, places_2017, on='Num_Acc')
data_2017 = pd.merge(data_2017, vehicles_2017, on='Num_Acc')
data_2017 = data_2017.dropna()
print("The merged dataset now contains: ", len(data_2017), "observations.")

# 2016 data
data_2016 = pd.merge(users_2016, char_2016, on='Num_Acc')
data_2016 = pd.merge(data_2016, places_2016, on='Num_Acc')
data_2016 = pd.merge(data_2016, vehicles_2016, on='Num_Acc')
data_2016 = data_2016.dropna()
print("The merged dataset now contains: ", len(data_2016), "observations.")

# Concatenate data sets
data = pd.concat([data_2018,data_2017],axis=0)
data = pd.concat([data,data_2016],axis=0)

# Set the year to eg 2018 instead of 18
data.an = data.an+2000

data['latS'] = data.lat/10**5
data['longS'] = data['long']/10**5
data = data[data['latS'] >= 40]
data = data[data['latS'] <= 51.25]
data = data[data['longS'] >= -5]
data = data[data['longS'] <= 9.8]

names = {'an': 'year', 'grav': 'severity', 'sexe': 'gender', 'an_nais': 'birth', 'jour': 'day', 'hrmn': 'HHMM',
         'catr': 'roadtype', 'mois': 'month','catu': 'User category', 'trajet': 'trip purpose','secu': 'safety', 
         'nbv': 'traffic lanes', 'surf': 'surface condition', 'infra': 'infrastructure', 'situ': 'situation',
         'catv': 'vehicle category','col': 'collision_type'}
data = data.rename(columns = names, inplace = False)

# Set the HH/mm to eg 0012 instead of 12
data['HHMM'] = data.HHMM.apply(lambda x: '000'+str(x) if (len(str(x))==1) else '00'+str(x) if (len(str(x))==2) else '0'+str(x) if (len(str(x))==3) else str(x))
data['minute'] = data.HHMM.apply(lambda x: x[2:])
data['hour'] = data.HHMM.apply(lambda x: x[:2])

data.info()
The merged dataset now contains:  232570 observations.
The merged dataset now contains:  196921 observations.
The merged dataset now contains:  191475 observations.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 582482 entries, 0 to 249209
Data columns (total 29 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Num_Acc            582482 non-null  int64  
 1   User category      582482 non-null  int64  
 2   severity           582482 non-null  int64  
 3   gender             582482 non-null  int64  
 4   trip purpose       582482 non-null  float64
 5   safety             582482 non-null  float64
 6   birth              582482 non-null  float64
 7   year               582482 non-null  int64  
 8   month              582482 non-null  int64  
 9   day                582482 non-null  int64  
 10  HHMM               582482 non-null  object 
 11  lum                582482 non-null  int64  
 12  atm                582482 non-null  float64
 13  collision_type     582482 non-null  float64
 14  com                582482 non-null  int64  
 15  lat                582482 non-null  float64
 16  long               582482 non-null  float64
 17  roadtype           582482 non-null  int64  
 18  traffic lanes      582482 non-null  float64
 19  prof               582482 non-null  float64
 20  plan               582482 non-null  float64
 21  surface condition  582482 non-null  float64
 22  infrastructure     582482 non-null  float64
 23  situation          582482 non-null  float64
 24  vehicle category   582482 non-null  int64  
 25  latS               582482 non-null  float64
 26  longS              582482 non-null  float64
 27  minute             582482 non-null  object 
 28  hour               582482 non-null  object 
dtypes: float64(15), int64(11), object(3)
memory usage: 133.3+ MB

Column including the age of the victims

In [9]:
data['age'] = (data.year - data.birth)

Replace numbers with names of categoies

In [10]:
# Replace weather condition index with weather condition name
num2atm = {1: 'Normal ', 2: 'Light Rain', 3: 'Heavy Rain', 4: 'Snow - hail', 5: 'Fog - smoke', 6: 'Strong wind - storm', 7: 'Dazzling weather', 8: 'Cloudy weather', 9: 'Other'}
data['atm_name'] = data.atm.apply(lambda x: num2atm[x])

num2severity = {1: 'Unscathed ', 2: 'Killed', 3: 'Hospitalized wounded', 4: 'Light injury'}
data['severity_name'] = data.severity.apply(lambda x: num2severity[x])

num2lum = {1: 'Full day', 2: 'Twilight or dawn', 3: 'Night without public lighting', 4:'Night with public lighting not lit', 5:'Night with public lighting on'}
data['lum_name'] = data.lum.apply(lambda x: num2lum[x])

num2sex = {1: 'Male', 2: 'Female'}
data['gender'] = data.gender.apply(lambda x: num2sex[x])

num2userCategory = {1: 'Driver', 2: 'Passenger', 3:'Pedestrian', 4:'Skateboarder or scooter'}
data['User category'] = data['User category'].apply(lambda x: num2userCategory[x])

num2tripPurpose = {0: 'Not denoted', 1: 'Home - work', 2: 'Home - school', 3:'Shopping', 4:'Professional use', 5:'Walk - leisure', 9:'Other'}
data['trip purpose'] = data['trip purpose'].apply(lambda x: num2tripPurpose[x])

num2safety = {1: 'seatbelt', 2:'helmet', 11:'seatbelt yes', 12:'seatbelt no',13:'seatbelt not denoted', 21:'helmet yes', 22:'helmet no', 23:'helmet not denoted', 3:'Children device',31:'Children device yes', 32:'Children device no', 33:'Children device not denoted', 41:'Reflective equipment yes',42:'Reflective equipment no',43:'Reflective equipment not denoted',91:'Other yes',92:'Other no', 93:'Other not denoted'}
data['safety'] = data['safety'].apply(lambda x: num2safety[x])

num2prof = {0: 'Not denoted', 1: 'Straight', 2: 'Slope', 3:'Hilltop', 4:'Coastline'}
data['prof'] = data['prof'].apply(lambda x: num2prof[x])

num2plan = {0: 'Not denoted', 1: 'Rectilinear part', 2: 'Curving to the left', 3:'Curving to the right', 4:'S curve'}
data['plan'] = data['plan'].apply(lambda x: num2plan[x])

num2surfaceCondition = {0: 'Not denoted', 1: 'Normal', 2: 'Wet', 3:'puddles', 4:'flooded',5:'snowy',6:'mud',7:'icy',8:'fats - oil',9:'Other'}
data['surface condition'] = data['surface condition'].apply(lambda x: num2surfaceCondition[x])

num2infrastructure = {0: 'Not denoted', 1: 'Underground - tunnel', 2: 'Bridge - flyover', 3:'Exchanger or connection sling', 4:'Track',5:'Arranged crossroads',6:'Pedestrian zone',7:'Toll zone'}
data['infrastructure'] = data['infrastructure'].apply(lambda x: num2infrastructure[x])

num2roadtype = {0: 'Not denoted', 1: 'Highway', 2: 'National road', 3:'Departmental road', 4:'Communal roads',5:'Outside the public network',6:'Parking lot open to public traffic',7:'Urban metropolis roads', 9:'Other'}
data['roadtype'] = data['roadtype'].apply(lambda x: num2roadtype[x])

num2col = {1:'Two vehicles - frontal', 2: 'Two vehicles - from the rear', 
            3:'Two vehicles - from the side', 4:'Three vehicles and more - in a chain', 
            5:'Three or more vehicles - multiple collisions', 6:'Other collision', 
            7:'No collision'}
#num2col = {1:'Two vehicles', 2: 'Two vehicles', 3:'Two vehicles', 4:'Three or more vehicles', 
#            5:'Three or more vehicles', 6:'Other collision', 7:'No collision'}
data['collision_type'] = data['collision_type'].apply(lambda x: num2col[x])
In [11]:
#np.sort(data.collision_type.unique())
data.collision_type.value_counts()
Out[11]:
Two vehicles - from the side                    180414
Other collision                                  84712
Two vehicles - from the rear                     84076
Three vehicles and more - in a chain             78643
Two vehicles - frontal                           67986
Three or more vehicles - multiple collisions     67240
No collision                                     19411
Name: collision_type, dtype: int64

Create vehicle categories from existing vehicle catgories

In [12]:
def crude_vc(cat):
    if cat in [1]:
        return 'bicycle'
    elif cat in [2, 3, 4, 5, 6, 30, 31, 32, 33, 34, 35, 36, 41, 42, 43]:
        return 'light vehicles'
    elif cat in [7, 8, 9, 10, 11, 12, 13, 14, 15]:
        return 'car'
    elif cat in [16, 17, 19, 21, 37, 38, 40]:
        return 'heavy vehicle'
    elif cat in [39]:
        return 'train'
    elif cat in [18, 20, 50, 60, 80, 99]:
        return 'other'
    else:
        return -1

data['crude_vc'] = data['vehicle category'].apply(lambda x: crude_vc(x))

Time dependency

In [13]:
data['date'] = pd.to_datetime(data.year.astype(str)+'/'+data.month.astype(str)+'/'+data.day.astype(str), format='%Y/%m/%d')
In [14]:
data.groupby('date')['Num_Acc'].count().plot(figsize=(16,8))
plt.title("Number of accidents over time",fontsize=22)
plt.axhline(y = np.mean(data.groupby('date')['Num_Acc'].count()), color = 'r', linestyle = '--')
Out[14]:
<matplotlib.lines.Line2D at 0x7fa208bcb250>
In [15]:
fig = px.line(data.groupby(['date'])['Num_Acc'].count(), x=data.groupby(['date'])['Num_Acc'].count().index, y='Num_Acc',labels={"x": "Date",  "Num_Acc": "Number of Accidents"})
fig.update_layout(
    title="Number of accident on a given date",
    xaxis_title="Date",
    yaxis_title="Number of accidents on a given date",
)
fig.update_traces(line_color='red')
fig.show()
In [16]:
data.groupby(['year','month'])['Num_Acc'].count().plot(kind='bar',figsize=(16,8))
plt.title("Number of accidents per month",fontsize=22)
Out[16]:
Text(0.5, 1.0, 'Number of accidents per month')
In [17]:
data.groupby(['year','month'])['Num_Acc'].count().plot(kind = "bar",figsize=(16,8))
plt.title("Number of accidents per month",fontsize=22)
Out[17]:
Text(0.5, 1.0, 'Number of accidents per month')
In [18]:
fig = px.bar(data.groupby(['month'])['Num_Acc'].count(), x=data.groupby(['month'])['Num_Acc'].count().index, y='Num_Acc',labels={"x": "Month",  "Num_Acc": "Number of Accidents"})
fig.update_layout(
    title="Number of reported accidents per month",
    xaxis_title="Month",
    yaxis_title="Number of accidents per month",
    xaxis = dict(
        tickmode = 'array',
        tickvals = [1,2,3,4,5,6,7,8,9,10,11,12],
        ticktext = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    )
)
fig.update_traces(marker_color='red')
fig.show()
In [19]:
#df_test = data.groupby(['lum','severity']).size()
df_test = pd.DataFrame({'count' : data.groupby( [ "lum_name", "severity_name"] ).size()}).reset_index()
In [20]:
pivot_df = df_test.pivot(index='lum_name', columns='severity_name', values='count')
axes = pivot_df.plot.bar(stacked=True, figsize=(10,7),rot=45)
axes.legend(loc='center right', bbox_to_anchor=(1.3, 0.45),fancybox=True, shadow=True)
plt.xticks(horizontalalignment="right")
plt.ylabel('Number of accidents')
plt.xlabel('Lum category')
Out[20]:
Text(0.5, 0, 'Lum category')
In [21]:
sev_norm = pd.DataFrame(data.groupby(['lum_name','severity_name']).size().unstack())
sev_norm = sev_norm.div(sev_norm.sum(axis=1), axis=0)
#sev_norm = sev_norm.transpose()
sev_norm
Out[21]:
severity_name Hospitalized wounded Killed Light injury Unscathed
lum_name
Full day 0.159574 0.020217 0.347395 0.472814
Night with public lighting not lit 0.150229 0.028440 0.386697 0.434633
Night with public lighting on 0.134234 0.011488 0.402655 0.451623
Night without public lighting 0.228309 0.055436 0.322382 0.393873
Twilight or dawn 0.175036 0.025413 0.339427 0.460124
In [22]:
axes = sev_norm.plot(kind='bar',stacked=True,rot=45,figsize=(16,8))
#axes = pivot_df.plot.bar(stacked=True, figsize=(10,7),rot=45)
axes.legend(loc='center right', bbox_to_anchor=(1.3, 0.45),fancybox=True, shadow=True)
plt.xticks(horizontalalignment="right")
plt.title("Normalized number of accidents per lum category",fontsize=20)
plt.ylabel("Relative frequency")
plt.xlabel("Lum category")
Out[22]:
Text(0.5, 0, 'Lum category')

When normalizing it can be seen that "Night without public light" is the category where most people are killed.

In [23]:
#data['HHMM'] = data.HHMM.apply(lambda x: '000'+str(x) if (len(str(x))==1) else '00'+str(x) if (len(str(x))==2) else '0'+str(x) if (len(str(x))==3) else str(x))
data['minute'] = data.HHMM.apply(lambda x: x[2:])
data['hour'] = data.HHMM.apply(lambda x: x[:2])
In [24]:
plt.figure(figsize=(20,8))
plt.subplot(1,2,1)
plt.title('Number of reported accidents per hour')
data.hour.value_counts().sort_index().plot(kind='bar',rot=0)
plt.xlabel('Hour of the day')
plt.ylabel('Number of reported accidents')
plt.subplot(1,2,2)
plt.title('Number of reported accidents per minute')
data.minute.value_counts().sort_index().plot(kind='bar',rot=0,xticks=np.arange(0, 60, 5))
plt.xlabel('Minute of the day')
plt.ylabel('Number of reported accidents')
plt.show();

Graphs used for presentation

In [25]:
#plt.figure(figsize=(10,8))
#plt.title('Number of reported accidents per hour', fontsize = 32)
#data.hour.value_counts().sort_index().plot(kind='bar',rot=0)
#plt.xlabel('Hour of the day', fontsize = 29)
#plt.ylabel('Number of reported accidents', fontsize = 29)
#plt.show();

Interactive bar charts

In [26]:
fig = px.bar(data.groupby(['hour'])['Num_Acc'].count(), x=data.groupby(['hour'])['Num_Acc'].count().index, y='Num_Acc',labels={"x": "Hour of day",  "Num_Acc": "Number of Accidents"})
fig.update_layout(
    title="Number of reported accidents per hour of the day",
    xaxis_title="Hour of the day",
    yaxis_title="Number of accidents per hour of the day",
    xaxis = dict(
        tickmode = 'array',
        tickvals = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23],
    #    ticktext = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    )
)
fig.update_traces(marker_color='red')
fig.show()
In [27]:
fig = px.bar(data.groupby(['minute'])['Num_Acc'].count(), x=data.groupby(['minute'])['Num_Acc'].count().index, y='Num_Acc',labels={"x": "Minute of the day",  "Num_Acc": "Number of Accidents"})
fig.update_layout(
    title="Number of reported accidents per minute of the day",
    xaxis_title="Minute of the day",
    yaxis_title="Number of accidents per minute of the day",
    xaxis = dict(
        tickmode = 'linear',
        tick0 = 0,
        dtick = 5
    )
)
fig.update_traces(marker_color='red')
fig.show()
In [28]:
fig = px.bar(data.groupby(['age'])['Num_Acc'].count(), x=data.groupby(['age'])['Num_Acc'].count().index, y='Num_Acc',labels={"x": "Age of victim",  "Num_Acc": "Number of Accidents"})
fig.update_layout(
    title="Number of victims with a given age",
    xaxis_title="Age of victim",
    yaxis_title="Number of victims with a given age",
    xaxis = dict(
        tickmode = 'linear',
        tick0 = 0,
        dtick = 5
    )
)
fig.update_traces(marker_color='red')
fig.show()
In [29]:
#plt.figure(figsize=(10,7))
#plt.title('Weather conditions', fontsize = 32)
#data['atm_name'].value_counts().sort_index().plot(kind='bar',rot=45)
#plt.xlabel('Atmospheric condition', fontsize = 29)
#plt.ylabel('Reported accidents in a given weather', fontsize = 23)
#plt.xticks(np.arange(0, int(data.age.max()), step=5))
#plt.show();
In [30]:
fig = px.bar(data.groupby(['atm_name'])['Num_Acc'].count(), x=data.groupby(['atm_name'])['Num_Acc'].count().index, y='Num_Acc',labels={"x": "Atmospheric condition",  "Num_Acc": "Number of Accidents"})
fig.update_layout(
    title="Number of accident under a given atmospheric conditions",
    xaxis_title="Atmospheric condition",
    yaxis_title="Number of accidents under a given atmospheric condition",
)
fig.update_traces(marker_color='red')
fig.show()
In [31]:
# Make age into age groups
bins = [-1, 9, 19, 29, 39, 49, 59 ,69 , 79, 89, np.inf]
labels = ['0-9',"10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80-89", "90+"]
data['age_group'] = pd.cut(data['age'],bins,labels=labels)
data['gender_agegroup'] = data.gender + ' ' + data.age_group.astype(str)
data.groupby('gender_agegroup').size().plot(kind='bar',rot=45,figsize=(12,8))
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa206a17710>

Folium map

In [32]:
from IPython.core.display import display, HTML
import tempfile

# Reference til kode: 
def folium_deepnote_show(m):
    tmp_output_filename = tempfile.NamedTemporaryFile(suffix='.html').name
    m.save(tmp_output_filename)

    f = open(tmp_output_filename, "r")
    data = f.read()
    data_fixed_width = data.replace('width: 100%;height: 100%', 'width: 100%').replace('height: 100.0%;', 'height: 609px;')
    display(HTML(data_fixed_width))
In [33]:
lat_lon = data[['longS','latS','severity','date','severity_name','gender','age','lum_name','atm_name','roadtype','traffic lanes',
'surface condition','crude_vc','collision_type']].sample(10000)
locations = lat_lon[['latS','longS']]
locationlist = locations.values.tolist()
In [34]:
print('''this is {}
that is {}'''.format('help',10))
this is help
that is 10
In [35]:
from IPython.display import display
from folium.plugins import FastMarkerCluster

# Possible styles for folium, last do not work properly
t_list = ["Stamen Terrain", "Stamen Toner", "Mapbox Bright", 'Cartodb Positron']

# Define the map with coordinates, style and zoom
m = folium.Map(location=[47.092038, 2.392312], 
                    tiles = t_list[3],
                    zoom_start = 6)

# Define colours
colours = ['lightgreen','black','darkred','orange']
# Unscathed, killed, hospitalized, light injury

# Add markers for each accident
for point in range(0, len(locationlist[:1000])):
    html = '''
    Date: {}<br>
    Severity: {}<br>
    Gender: {}<br>
    Age: {}<br>
    Lumination: {}<br>
    Atmosphere: {}<br>
    Roadtype: {}<br>
    Traffic lanes: {}<br>
    Surface conditions: {}<br>
    Vehicle type: {}<br>
    Collision type: {}
    '''.format(lat_lon.date.iloc[point],lat_lon.severity_name.iloc[point],lat_lon.gender.iloc[point],lat_lon.age.iloc[point],
    lat_lon.lum_name.iloc[point],lat_lon.atm_name.iloc[point],lat_lon.roadtype.iloc[point],lat_lon['traffic lanes'].iloc[point],
    lat_lon['surface condition'].iloc[point],lat_lon.crude_vc.iloc[point],lat_lon.collision_type.iloc[point])
    iframe = folium.IFrame(html,width=300,height=300)
    popup = folium.Popup(iframe,max_width=500)
    folium.CircleMarker(locationlist[point], radius=2, popup=popup, color=colours[lat_lon['severity'].iloc[point]-1]).add_to(m)

# Add clusters of number of accidents in areas
##m.add_child(FastMarkerCluster(locations[['lat', 'long']].values.tolist()))

# Display map
#folium_deepnote_show(m)
m
Out[35]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [36]:
lat_lon = data[['longS','latS','severity','date','severity_name','gender','age','lum_name','atm_name','roadtype','traffic lanes',
'surface condition','crude_vc','collision_type']]
lat_lon_special = lat_lon[lat_lon.date == '2016-12-20']
locations = lat_lon_special[['latS','longS']]
locationlist = locations.values.tolist()
In [37]:
from IPython.display import display
from folium.plugins import FastMarkerCluster

# Possible styles for folium, last do not work properly
t_list = ["Stamen Terrain", "Stamen Toner", "Mapbox Bright", 'Cartodb Positron']

# Define the map with coordinates, style and zoom
m = folium.Map(location=[47.092038, 2.392312], 
                    tiles = t_list[3],
                    zoom_start = 6)

# Define colours
colours = ['lightgreen','black','darkred','orange']
# Unscathed, killed, hospitalized, light injury

# Add markers for each accident

#for point in range(0, len(locationlist)):
#    html = '''
#    Date: {}<br>
#    Severity: {}<br>
#    Gender: {}<br>
#    Age: {}<br>
#    Lumination: {}<br>
#    Atmosphere: {}<br>
#    Roadtype: {}<br>
#    Traffic lanes: {}<br>
#    Surface conditions: {}<br>
#    Vehicle type: {}<br>
#    Collision type: {}
#    '''.format(lat_lon_special.date.iloc[point],lat_lon_special.severity_name.iloc[point],lat_lon_special.gender.iloc[point],lat_lon_special.age.iloc[point],
#    lat_lon_special.lum_name.iloc[point],lat_lon_special.atm_name.iloc[point],lat_lon_special.roadtype.iloc[point],lat_lon_special['traffic lanes'].iloc[point],
#    lat_lon_special['surface condition'].iloc[point],lat_lon_special.crude_vc.iloc[point],lat_lon_special.collision_type.iloc[point])
#    iframe = folium.IFrame(html,width=300,height=300)
#    popup = folium.Popup(iframe,max_width=500)
#    folium.CircleMarker(locationlist[point], radius=2, popup=popup, color=colours[lat_lon['severity'].iloc[point]-1]).add_to(m)
#'''
# Add clusters of number of accidents in areas
#m.add_child(FastMarkerCluster(locations[['latS', 'longS']].values.tolist()))

# Display map
#folium_deepnote_show(m)
m
Out[37]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [38]:
lat_lon_special.groupby(['longS','latS']).size().sort_values()
Out[38]:
longS     latS    
 4.35274  48.96741       1
-0.20847  47.48568       1
 2.44394  48.81720       1
 1.29486  45.88130       1
 4.06501  46.07005       1
                      ... 
 7.76317  48.56948      14
-1.59951  49.60690      16
 1.75034  49.94483      20
 4.01599  49.27246      24
-1.60176  46.62032    2205
Length: 86, dtype: int64
In [39]:
sns.scatterplot(data=data,x='long',y='lat')
Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa20373da50>
In [40]:
import matplotlib.cm as cm
# Stjålet fra github-fyrene
def set_color(data, col) :
    categories = np.unique(data[col])
    colors = cm.rainbow(np.linspace(0, 1, len(categories)))
    colordict = dict(zip(categories, colors))
    #print(colordict)
    return data[col].apply(lambda x: colordict[x])
In [41]:
#categories = np.unique(data['severity'])
#colors = cm.rainbow(np.linspace(0, 1, len(categories)))
#colordict = dict(zip(categories, colors))
#data['color'] = colordict[data['severity']]
In [42]:
# Gravity of the accident


#cm = plt.cm.get_cmap('RdYlBu')
plt.figure(figsize=(10,8))
sc = plt.scatter(data['long'], data['lat'],c = set_color(data,'severity'), s=0.1,vmin=data.severity.min(), vmax=data.severity.max())#,cmap = cm.rainbow)
plt.title('Location of the accident, depending on gravity')
plt.colorbar(sc)
plt.show()
In [43]:
set_color(data, 'severity')
Out[43]:
0         [0.8333333333333333, 0.8660254037844388, 0.500...
1         [0.8333333333333333, 0.8660254037844388, 0.500...
2                                      [0.5, 0.0, 1.0, 1.0]
3                                      [0.5, 0.0, 1.0, 1.0]
4                                      [0.5, 0.0, 1.0, 1.0]
                                ...                        
249205    [1.0, 1.2246467991473532e-16, 6.12323399573676...
249206                                 [0.5, 0.0, 1.0, 1.0]
249207                                 [0.5, 0.0, 1.0, 1.0]
249208    [1.0, 1.2246467991473532e-16, 6.12323399573676...
249209    [1.0, 1.2246467991473532e-16, 6.12323399573676...
Name: severity, Length: 582482, dtype: object
In [44]:
#pip install shap

Prepare data for data analysis

In [45]:
features = ['User category','gender','trip purpose','safety','year','month','day','hour','minute','lum_name','atm_name','lat','long','roadtype','prof','plan','surface condition','infrastructure','crude_vc','collision_type']
target = ['severity_name']
help_df = data[['severity_name','User category','gender','trip purpose','safety','year','month','day','hour','minute','lum_name','atm_name','lat','long','roadtype','prof','plan','surface condition','infrastructure','crude_vc','collision_type']]
help_df
Out[45]:
severity_name User category gender trip purpose safety year month day hour minute ... atm_name lat long roadtype prof plan surface condition infrastructure crude_vc collision_type
0 Hospitalized wounded Driver Male Not denoted seatbelt yes 2018 1 24 15 05 ... Normal 5055737.0 294992.0 Departmental road Straight Curving to the right Normal Not denoted car Two vehicles - frontal
1 Hospitalized wounded Driver Male Not denoted seatbelt yes 2018 1 24 15 05 ... Normal 5055737.0 294992.0 Departmental road Straight Curving to the right Normal Not denoted car Two vehicles - frontal
2 Unscathed Driver Male Walk - leisure seatbelt yes 2018 1 24 15 05 ... Normal 5055737.0 294992.0 Departmental road Straight Curving to the right Normal Not denoted car Two vehicles - frontal
3 Unscathed Driver Male Walk - leisure seatbelt yes 2018 1 24 15 05 ... Normal 5055737.0 294992.0 Departmental road Straight Curving to the right Normal Not denoted car Two vehicles - frontal
4 Unscathed Driver Male Not denoted seatbelt yes 2018 2 12 10 15 ... Dazzling weather 5052936.0 293151.0 Communal roads Straight Curving to the left Normal Not denoted car No collision
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
249205 Light injury Driver Female Home - work seatbelt yes 2016 12 29 17 45 ... Normal 4887110.0 241310.0 Communal roads Straight Rectilinear part Not denoted Not denoted car Two vehicles - from the rear
249206 Unscathed Driver Male Home - work seatbelt yes 2016 12 29 17 45 ... Normal 4887110.0 241310.0 Communal roads Straight Rectilinear part Not denoted Not denoted car Two vehicles - from the rear
249207 Unscathed Driver Male Home - work seatbelt yes 2016 12 29 17 45 ... Normal 4887110.0 241310.0 Communal roads Straight Rectilinear part Not denoted Not denoted car Two vehicles - from the rear
249208 Light injury Driver Female Walk - leisure seatbelt yes 2016 12 30 14 50 ... Normal 4885850.0 241411.0 Communal roads Straight Rectilinear part Normal Not denoted car Other collision
249209 Light injury Driver Female Walk - leisure seatbelt yes 2016 12 30 14 50 ... Normal 4885850.0 241411.0 Communal roads Straight Rectilinear part Normal Not denoted car Other collision

582482 rows × 21 columns

In [46]:
# Reduce data size to not run out of memory
sample_size = len(help_df[help_df.severity_name == 'Killed']) 
print("Sample size:", sample_size)
Unscathed = help_df[help_df.severity_name == 'Unscathed '].sample(sample_size)
Light_injury = help_df[help_df.severity_name == 'Light injury'].sample(sample_size)
Hospitalized_wounded = help_df[help_df.severity_name == 'Hospitalized wounded'].sample(sample_size)
Killed = help_df[help_df.severity_name == 'Killed'].sample(sample_size)
data_balanced = pd.concat([Unscathed,Light_injury,Hospitalized_wounded,Killed],axis=0)
Sample size: 13174
In [47]:
data_balanced.head()
Out[47]:
severity_name User category gender trip purpose safety year month day hour minute ... atm_name lat long roadtype prof plan surface condition infrastructure crude_vc collision_type
91450 Unscathed Driver Male Home - work seatbelt yes 2018 6 25 15 20 ... Normal 5051772.0 306066.0 Departmental road Straight Rectilinear part Normal Not denoted light vehicles Two vehicles - from the side
141559 Unscathed Driver Male Walk - leisure seatbelt yes 2016 8 1 09 30 ... Normal 4854300.0 773656.0 Communal roads Straight Rectilinear part Normal Not denoted car Two vehicles - from the rear
10983 Unscathed Passenger Male Walk - leisure seatbelt yes 2017 6 3 19 25 ... Normal 4555817.0 595834.0 Departmental road Straight Rectilinear part Normal Not denoted light vehicles Two vehicles - from the side
188634 Unscathed Driver Male Walk - leisure seatbelt yes 2018 5 7 16 50 ... Normal 4895508.0 245813.0 Highway Straight Rectilinear part Normal Not denoted car Two vehicles - from the side
876 Unscathed Passenger Female Walk - leisure seatbelt yes 2018 10 4 17 45 ... Normal 5013038.0 150174.0 Departmental road Straight Rectilinear part Normal Not denoted light vehicles Three or more vehicles - multiple collisions

5 rows × 21 columns

One-hot encode categorical features

In [48]:
from scipy.fftpack import fft, dct

df_features = data_balanced[features]

user = pd.get_dummies(df_features['User category'])
df_features = pd.concat([df_features,user],axis=1)
purpose = pd.get_dummies(df_features['trip purpose'],prefix='Purpose')
df_features = pd.concat([df_features,purpose],axis=1)
safety = pd.get_dummies(df_features['safety'],prefix='Safety')
df_features = pd.concat([df_features,safety],axis=1)
prof = pd.get_dummies(df_features['prof'],prefix='prof')
df_features = pd.concat([df_features,prof],axis=1)
plan = pd.get_dummies(df_features['plan'],prefix='plan')
df_features = pd.concat([df_features,plan],axis=1)
surface = pd.get_dummies(df_features['surface condition'],prefix='surface')
df_features = pd.concat([df_features,surface],axis=1)
infrastructure = pd.get_dummies(df_features['infrastructure'],prefix='infra')
df_features = pd.concat([df_features,infrastructure],axis=1)
gender = pd.get_dummies(df_features.gender)
df_features = pd.concat([df_features,gender],axis=1)
lum = pd.get_dummies(df_features.lum_name,prefix='lum')
df_features = pd.concat([df_features,lum],axis=1)
atm = pd.get_dummies(df_features.atm_name,prefix='atm')
df_features = pd.concat([df_features,atm],axis=1)
roadtype = pd.get_dummies(df_features.roadtype,prefix='roadtype')
df_features = pd.concat([df_features,roadtype],axis=1)
crude = pd.get_dummies(df_features.crude_vc,prefix='vehtype')
df_features = pd.concat([df_features,crude],axis=1)
col_type = pd.get_dummies(df_features.collision_type,prefix='col')
df_features = pd.concat([df_features,col_type],axis=1)

df_features = df_features.drop(columns=['User category','trip purpose','safety','prof','plan','surface condition','infrastructure','lum_name','atm_name','gender','roadtype','crude_vc','collision_type'])
df_features
Out[48]:
year month day hour minute lat long Driver Passenger Pedestrian ... vehtype_light vehicles vehtype_other vehtype_train col_No collision col_Other collision col_Three or more vehicles - multiple collisions col_Three vehicles and more - in a chain col_Two vehicles - from the rear col_Two vehicles - from the side col_Two vehicles - frontal
91450 2018 6 25 15 20 5051772.0 306066.0 1 0 0 ... 1 0 0 0 0 0 0 0 1 0
141559 2016 8 1 09 30 4854300.0 773656.0 1 0 0 ... 0 0 0 0 0 0 0 1 0 0
10983 2017 6 3 19 25 4555817.0 595834.0 0 1 0 ... 1 0 0 0 0 0 0 0 1 0
188634 2018 5 7 16 50 4895508.0 245813.0 1 0 0 ... 0 0 0 0 0 0 0 0 1 0
876 2018 10 4 17 45 5013038.0 150174.0 0 1 0 ... 1 0 0 0 0 1 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2002 2017 7 7 12 45 4915252.0 134655.0 1 0 0 ... 0 0 0 0 0 1 0 0 0 0
19439 2018 10 26 15 47 4909220.0 205930.0 1 0 0 ... 0 0 0 0 0 0 0 0 1 0
91395 2018 2 8 08 30 4523917.0 149556.0 0 1 0 ... 0 0 0 0 0 0 0 0 0 1
14641 2017 6 26 08 30 4648585.0 398998.0 0 0 1 ... 0 0 0 0 1 0 0 0 0 0
42968 2016 11 21 07 45 4827000.0 148700.0 0 1 0 ... 0 0 0 0 1 0 0 0 0 0

52696 rows × 101 columns

Split in training and test set

In [49]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Split data
X = df_features 
X.hour = X.hour.astype(int)
X.minute = X.minute.astype(int)
y = data_balanced[target]
#x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

train_perc = 0.75 # percentage of training data
split_point = int(train_perc*len(y))
perm = np.random.permutation(len(y))
ix_train = perm[:split_point] # index of training data
ix_test = perm[split_point:] # index of test data
x_train = X.iloc[ix_train,:]
x_test = X.iloc[ix_test,:]
y_train = y.iloc[ix_train]
y_test = y.iloc[ix_test]
print("num train: %d" % len(y_train))
print("num test: %d" % len(y_test))
num train: 39522
num test: 13174

Baseline model - Logistic regression

In [50]:
# Define logistic regression and predict severity class

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

LogReg = LogisticRegression(multi_class='multinomial')
LogReg.fit(x_train, y_train)
y_pred = LogReg.predict(x_test)

acc_LogReg = round(accuracy_score(y_pred, y_test) * 100, 2)
print("Accuracy logistic regression:", acc_LogReg)

target_names = y.severity_name.unique() #np.unique(y.values)
print(classification_report(y_test, y_pred, target_names=target_names))

mat = confusion_matrix(y_test, y_pred)
sns.heatmap(mat, square=True, annot=True, cbar=True,fmt='g',cmap='Blues')
plt.xlabel('predicted value')
plt.ylabel('true value');
/Users/asgermunch/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:760: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().

Accuracy logistic regression: 29.57
/Users/asgermunch/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

                      precision    recall  f1-score   support

          Unscathed        0.28      0.33      0.30      3265
        Light injury       0.31      0.44      0.36      3300
Hospitalized wounded       0.30      0.43      0.35      3258
              Killed       0.00      0.00      0.00      3351

            accuracy                           0.30     13174
           macro avg       0.22      0.30      0.25     13174
        weighted avg       0.22      0.30      0.25     13174

XG Boost

In [51]:
import shap
In [52]:
!pip install xgboost
Requirement already satisfied: xgboost in /Users/asgermunch/opt/anaconda3/lib/python3.7/site-packages (1.4.0)
Requirement already satisfied: scipy in /Users/asgermunch/opt/anaconda3/lib/python3.7/site-packages (from xgboost) (1.4.1)
Requirement already satisfied: numpy in /Users/asgermunch/opt/anaconda3/lib/python3.7/site-packages (from xgboost) (1.18.1)
In [53]:
severity2num = {'Unscathed ': 1,'Killed': 2,'Hospitalized wounded': 3,'Light injury': 4}
y_train_num = y_train.severity_name.apply(lambda x: severity2num[x])
In [54]:
# Import XGBoost for classification
from xgboost import XGBClassifier

# Parameters for XG Boost - Note some of these are for regreession :(
xgb_params = {'base_score': 0.5, 'booster': 'gblinear', 'colsample_bylevel': 0.5, 
            'colsample_bynode': 0.5, 'gamma': 0, 'gpu_id': 0, 'interaction_constraints': None,
            'max_delta_step': 10, 'min_child_weight': 1, 'n_jobs': -1, 'num_parallel_tree': 1,
            'random_state': 42, 'reg_alpha': 0, 'reg_lambda': 10, 'scale_pos_weight': 1,
            'objective': 'multi:softmax', 'num_class': 4, 'colsample_bytree': 0.3, 'learning_rate': 0.1, 
            'max_depth': 3,'n_estimators': 200, 'lambda': 10, 'subsample': 0.5, 
            'tree_method': 'auto', 'validate_parameters': False, 'verbosity': 1,
            'interaction_constraints': False, 'monotone_constraints': False}

# Define XG Boost model
xgb = XGBClassifier()

# Set parameters
#xgb.set_params(**xgb_params)

xgb.fit(x_train,y_train_num)

# Calcuate the explaining model
#explainer = shap.Explainer(xgb)

# Get the shap values (values that explain the impact)
#shap_values = explainer(x_train)

# summarize the effects of all the features
#shap.plots.beeswarm(shap_values)
#shap.plots.beeswarm(shap_values.sum(axis=2))
/Users/asgermunch/opt/anaconda3/lib/python3.7/site-packages/xgboost/sklearn.py:1146: UserWarning:

The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].

[15:13:13] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Out[54]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)
In [55]:
#shap_values.values.sum(axis=2).shape
#shap.plots.beeswarm(shap_values.sum(axis=2))
In [56]:
#shap.summary_plot(shap_values[:,:,1], x_train)
In [57]:
#shap.summary_plot(shap_values.sum(axis=2), x_train, plot_type="bar")
In [58]:
y_pred = xgb.predict(x_test)
/Users/asgermunch/opt/anaconda3/lib/python3.7/site-packages/xgboost/data.py:114: UserWarning:

Use subset (sliced data) of np.ndarray is not recommended because it will generate extra copies and increase memory consumption

In [59]:
y_pred = pd.DataFrame(y_pred,columns=['severity_num']).severity_num.apply(lambda x: num2severity[x])
In [60]:
acc_xgb = round(accuracy_score(y_pred, y_test) * 100, 2)
print("Accuracy XG Boost classifier:", acc_xgb)

target_names = y.severity_name.unique() #np.unique(y.values)
print(classification_report(y_test, y_pred, target_names=target_names))

mat = confusion_matrix(y_test, y_pred)
sns.heatmap(mat, square=True, annot=True, cbar=True,fmt='g',cmap='Blues')
plt.xlabel('predicted value')
plt.ylabel('true value');
Accuracy XG Boost classifier: 56.14
                      precision    recall  f1-score   support

          Unscathed        0.48      0.43      0.45      3265
        Light injury       0.58      0.66      0.61      3300
Hospitalized wounded       0.55      0.48      0.51      3258
              Killed       0.62      0.67      0.65      3351

            accuracy                           0.56     13174
           macro avg       0.56      0.56      0.56     13174
        weighted avg       0.56      0.56      0.56     13174

In [61]:
for col in x_train.columns:
    print(col)
year
month
day
hour
minute
lat
long
Driver
Passenger
Pedestrian
Skateboarder or scooter
Purpose_Home - school
Purpose_Home - work
Purpose_Not denoted
Purpose_Other
Purpose_Professional use
Purpose_Shopping
Purpose_Walk - leisure
Safety_Children device
Safety_Children device no
Safety_Children device not denoted
Safety_Children device yes
Safety_Other no
Safety_Other not denoted
Safety_Other yes
Safety_Reflective equipment no
Safety_Reflective equipment not denoted
Safety_Reflective equipment yes
Safety_helmet
Safety_helmet no
Safety_helmet not denoted
Safety_helmet yes
Safety_seatbelt
Safety_seatbelt no
Safety_seatbelt not denoted
Safety_seatbelt yes
prof_Coastline
prof_Hilltop
prof_Not denoted
prof_Slope
prof_Straight
plan_Curving to the left
plan_Curving to the right
plan_Not denoted
plan_Rectilinear part
plan_S curve
surface_Normal
surface_Not denoted
surface_Other
surface_Wet
surface_fats - oil
surface_flooded
surface_icy
surface_mud
surface_puddles
surface_snowy
infra_Arranged crossroads
infra_Bridge - flyover
infra_Exchanger or connection sling
infra_Not denoted
infra_Pedestrian zone
infra_Toll zone
infra_Track
infra_Underground - tunnel
Female
Male
lum_Full day
lum_Night with public lighting not lit
lum_Night with public lighting on
lum_Night without public lighting
lum_Twilight or dawn
atm_Cloudy weather
atm_Dazzling weather
atm_Fog - smoke
atm_Heavy Rain
atm_Light Rain
atm_Normal 
atm_Other
atm_Snow - hail
atm_Strong wind - storm
roadtype_Communal roads
roadtype_Departmental road
roadtype_Highway
roadtype_National road
roadtype_Other
roadtype_Outside the public network
roadtype_Parking lot open to public traffic
roadtype_Urban metropolis roads
vehtype_bicycle
vehtype_car
vehtype_heavy vehicle
vehtype_light vehicles
vehtype_other
vehtype_train
col_No collision
col_Other collision
col_Three or more vehicles - multiple collisions
col_Three vehicles and more - in a chain
col_Two vehicles - from the rear
col_Two vehicles - from the side
col_Two vehicles - frontal
In [62]:
feat_imp = {key: val for key,val in zip(x_train.columns,xgb.feature_importances_)}
In [63]:
feature_importance = pd.DataFrame(feat_imp,index=['feature_importance']).transpose()
feature_importance.feature_importance.sort_values(ascending=False)[:10].plot(kind='bar',figsize=(8,6))
plt.xlabel('Feature')
plt.ylabel('Percentage importance')
plt.title('Feature importance - 10 most important features');
In [64]:
from xgboost import plot_importance
plot_importance(xgb)
plt.show()

#print(len(xgb.feature_importances_))
plt.bar(range(len(xgb.feature_importances_)),xgb.feature_importances_)
plt.show()
In [65]:
fig = px.bar(data.groupby(['month'])['Num_Acc'].count(), x=data.groupby(['month'])['Num_Acc'].count().index, y='Num_Acc',labels={"x": "Month",  "Num_Acc": "Number of Accidents"})
fig.update_layout(
    title="Number of reported accidents per month",
    xaxis_title="Month",
    yaxis_title="Number of accidents per month",
    xaxis = dict(
        tickmode = 'array',
        tickvals = [1,2,3,4,5,6,7,8,9,10,11,12],
        ticktext = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    )
)
fig.show()
In [ ]: